Goal: Compare super claims 1, 3, and 5 — 1: Not Happening; 3: Climate Impacts Not Bad; 5: Science/Scientists Not Reliable.
# Knitr setup: echo all code chunks in the rendered document
knitr::opts_chunk$set(echo = TRUE)
library(jsonlite) # allows us to read in json files
library(tidyverse) # allows us to do lots of data manipulation and basic data science
library(here) # allows us to cut out long file paths (ex. "users/connor/downloads/etc")
library(forcats) # allows us to reorder and recode factors (also attached by tidyverse)
library(tidytext) # allows us to tokenize data
library(dplyr) # allows us to manipulate dataframes
library(stringr) # allows us to count the number of words in a cell
library(quanteda) # allows us to tokenize data
library(quanteda.textplots) # allows us to make network plots
library(gridExtra) # allows us to combine multiple plots into 1
library(wordcloud) # allows us to generate word clouds and comparison clouds
library(fmsb) # allows us to draw radar/spider charts
library(plotly) # allows us to make interactive plots
library(ggthemes) # allows us to use extra ggplot themes (e.g. theme_wsj)
library(tm) # allows us to build corpora and term-document matrices
Super Claim #1 Not Happening
nature_analysis <- read_csv(here("data/training.csv"))
## Rows: 23436 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): text, claim
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Use filter() to select super claim 1
# Keep only rows whose claim code falls under super claim 1 (codes like "1_x")
na_1 <- filter(nature_analysis, str_detect(claim, "1_"))
Add word_count column using mutate()
# Add a per-row word count by counting runs of non-whitespace in each text.
# Fix: refer to the column as bare `text` inside mutate() rather than
# `na_1$text` — indexing the whole data frame bypasses data-masked
# evaluation and silently misaligns if the pipeline ever filters or
# reorders rows upstream.
na_1 <- na_1 %>%
  mutate(word_count = str_count(text, "\\S+"))
#Distribution visual, geom_histogram
# Histogram of document word counts, filled by the specific claim label
na_1 %>%
  ggplot(aes(word_count, fill = claim)) +
  geom_histogram(bins = 67, color = "black") +
  theme_wsj() +
  theme(
    text = element_text(family = "Menlo-Bold", size = 12),
    legend.title = element_text(family = "Menlo-Bold", size = 12)
  ) +
  labs(title = "Distribution of Claims", subtitle = "Claim 1")
Tokenize using unnest_tokens() to separate text into words
# Split each text into one word per row, then tally word frequencies.
# count(..., sort = TRUE) is equivalent to count() + arrange(desc(n)).
na_1_tokenized <- na_1 %>%
  unnest_tokens(words, text) %>%
  count(words, sort = TRUE)
Filter() out stopwords()
# Drop common English stopwords from the frequency table
eng_stops <- stopwords("english")
na_1_tokenized <- filter(na_1_tokenized, !(words %in% eng_stops))
#Word Cloud visual
wordcloud(na_1_tokenized$words, freq = na_1_tokenized$n, max.words = 200, min.freq = 5, random.order = FALSE, colors = brewer.pal(12, "Paired"))
## Warning in wordcloud(na_1_tokenized$words, freq = na_1_tokenized$n, max.words =
## 200, : predictions could not be fit on page. It will not be plotted.
## Warning in wordcloud(na_1_tokenized$words, freq = na_1_tokenized$n, max.words =
## 200, : alarmists could not be fit on page. It will not be plotted.
## Warning in wordcloud(na_1_tokenized$words, freq = na_1_tokenized$n, max.words =
## 200, : minimum could not be fit on page. It will not be plotted.
## Warning in wordcloud(na_1_tokenized$words, freq = na_1_tokenized$n, max.words =
## 200, : recorded could not be fit on page. It will not be plotted.
# Co-occurrence network of the 30 most frequent claim-1 terms:
# tokenize, lowercase, strip punctuation/stopwords, build a windowed
# feature co-occurrence matrix, then plot the top features as a network.
claim1_tokens <- corpus(na_1$text) %>%
  tokens(remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)
claim1_fcm <- fcm(claim1_tokens, context = "window", tri = FALSE)
claim1_top <- names(topfeatures(claim1_fcm, 30))
claim1_fcm %>%
  fcm_select(pattern = claim1_top) %>%
  textplot_network(min_freq = 0.5)
Super Claim 3: Climate Impacts Not Bad. Use filter() to select super claim 3
# Keep only rows whose claim code falls under super claim 3 (codes like "3_x")
na_3 <- filter(nature_analysis, str_detect(claim, "3_"))
Add word_count column using mutate()
# Add a per-row word count by counting runs of non-whitespace in each text.
# Fix: refer to the column as bare `text` inside mutate() rather than
# `na_3$text` — indexing the whole data frame bypasses data-masked
# evaluation and silently misaligns if the pipeline ever filters or
# reorders rows upstream.
na_3 <- na_3 %>%
  mutate(word_count = str_count(text, "\\S+"))
#Distribution visual, geom_histogram
# Histogram of document word counts, filled by the specific claim label
na_3 %>%
  ggplot(aes(word_count, fill = claim)) +
  geom_histogram(bins = 67, color = "black") +
  theme_wsj() +
  theme(
    text = element_text(family = "Menlo-Bold", size = 12),
    legend.title = element_text(family = "Menlo-Bold", size = 12)
  ) +
  labs(title = "Distribution of Claims", subtitle = "Claim 3")
Tokenize using unnest_tokens()
# Split each text into one word per row, then tally word frequencies.
# count(..., sort = TRUE) is equivalent to count() + arrange(desc(n)).
na_3_tokenized <- na_3 %>%
  unnest_tokens(words, text) %>%
  count(words, sort = TRUE)
Filter() out stopwords()
# Remove tidytext stopwords plus citation artifacts ("et", "al", "2")
na_3_tokenized <- na_3_tokenized %>%
  filter(!(words %in% c("et", "al", "2"))) %>%
  anti_join(stop_words, by = c("words" = "word"))
#Word cloud visual
wordcloud(na_3_tokenized$words, freq = na_3_tokenized$n, max.words = 200, min.freq = 5, random.order = FALSE, random.color = FALSE, colors = brewer.pal(12, "Paired"))
# Co-occurrence network of the 30 most frequent claim-3 terms:
# tokenize, lowercase, strip punctuation/stopwords, build a windowed
# feature co-occurrence matrix, then plot the top features as a network.
claim3_tokens <- corpus(na_3$text) %>%
  tokens(remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)
claim3_fcm <- fcm(claim3_tokens, context = "window", tri = FALSE)
claim3_top <- names(topfeatures(claim3_fcm, 30))
claim3_fcm %>%
  fcm_select(pattern = claim3_top) %>%
  textplot_network(min_freq = 0.5)
Super Claim 5: Science/Scientists Not Reliable. Use filter() to select super claim 5
# Keep only rows whose claim code falls under super claim 5 (codes like "5_x")
na_5 <- filter(nature_analysis, str_detect(claim, "5_"))
Add word_count column using mutate()
# Add a per-row word count by counting runs of non-whitespace in each text.
# Fix: refer to the column as bare `text` inside mutate() rather than
# `na_5$text` — indexing the whole data frame bypasses data-masked
# evaluation and silently misaligns if the pipeline ever filters or
# reorders rows upstream.
na_5 <- na_5 %>%
  mutate(word_count = str_count(text, "\\S+"))
#Distribution visual, geom_histogram
# Histogram of document word counts, filled by the specific claim label
na_5 %>%
  ggplot(aes(word_count, fill = claim)) +
  geom_histogram(bins = 67, color = "black") +
  theme_wsj() +
  theme(
    text = element_text(family = "Menlo-Bold", size = 12),
    legend.title = element_text(family = "Menlo-Bold", size = 12)
  ) +
  labs(title = "Distribution of Claims", subtitle = "Claim 5")
Tokenize using unnest_tokens()
# Tokenize the claim-5 subset and tally word frequencies.
# Bug fix: tokenize `na_5` (the filtered claim-5 rows), NOT the full
# `nature_analysis` data frame — the original counted words across every
# claim, so this section's word cloud did not reflect super claim 5.
# (Variable name keeps the original "tokenzied" spelling because later
# chunks reference it by that name.)
na_5_tokenzied <- na_5 %>%
  unnest_tokens(words, text)
na_5_tokenzied <- na_5_tokenzied %>%
  count(words) %>%
  arrange(desc(n))
Filter() out stopwords()
# Drop common English stopwords from the frequency table
eng_stops <- stopwords("english")
na_5_tokenzied <- filter(na_5_tokenzied, !(words %in% eng_stops))
wordcloud(na_5_tokenzied$words, freq = na_5_tokenzied$n, max.words = 200, min.freq = 5, random.order = FALSE, random.color = FALSE, color = brewer.pal(12, "Paired"))
# Co-occurrence network of the 30 most frequent claim-5 terms:
# tokenize, lowercase, strip punctuation/stopwords, build a windowed
# feature co-occurrence matrix, then plot the top features as a network.
claim5_tokens <- corpus(na_5$text) %>%
  tokens(remove_punct = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(pattern = stopwords("english"), padding = FALSE)
claim5_fcm <- fcm(claim5_tokens, context = "window", tri = FALSE)
claim5_top <- names(topfeatures(claim5_fcm, 30))
claim5_fcm %>%
  fcm_select(pattern = claim5_top) %>%
  textplot_network(min_freq = 0.5)
# Build one word vector per super claim for the comparison cloud.
# Bug fix: the original converted each frequency table to a character
# matrix and toString()-ed every row, producing strings like
# "warming, 312" — the frequency counts leaked into the "text" and were
# later tokenized as terms. Instead, repeat each word by its observed
# count so term frequencies survive into the term matrix.
na_1_text <- rep(na_1_tokenized$words, times = na_1_tokenized$n)
na_3_text <- rep(na_3_tokenized$words, times = na_3_tokenized$n)
na_5_text <- rep(na_5_tokenzied$words, times = na_5_tokenzied$n)
if (require(tm)) {
  # One document per super claim.
  # Bug fix: the original passed the *names* of the text objects as
  # string literals ("na_1_text", ...), so the comparison cloud compared
  # three variable names instead of the word data. Collapse each word
  # vector into a single document string instead.
  texts <- c(
    paste(na_1_text, collapse = " "),
    paste(na_3_text, collapse = " "),
    paste(na_5_text, collapse = " ")
  )
  # Create a corpus with one document per element of `texts`
  corp <- Corpus(VectorSource(texts))
  # Bug fix: comparison.cloud() expects terms in rows and documents in
  # columns. DocumentTermMatrix() is the transpose of that, and the
  # original then relabeled every *term* column as "Document i". Use
  # TermDocumentMatrix() so the three columns really are the documents.
  term.matrix <- TermDocumentMatrix(corp)
  term.matrix <- as.matrix(term.matrix)
  # Label the three document columns by super claim
  colnames(term.matrix) <- c("Claim 1", "Claim 3", "Claim 5")
  # Comparison clouds: defaults, custom title colors (one per document),
  # and colors matched to each document's words
  comparison.cloud(term.matrix, max.words = 40, random.order = FALSE)
  comparison.cloud(term.matrix,
    max.words = 40, random.order = FALSE,
    title.colors = c("red", "blue", "darkgreen"),
    title.bg.colors = c("grey40", "grey70", "grey90")
  )
  comparison.cloud(term.matrix,
    max.words = 40, random.order = FALSE,
    match.colors = TRUE
  )
}